Import and Clean Data
if (!exists("accidents_raw")) {
accidents_raw <- read_csv(here("data", "raw_data", "US_Accidents_March23.csv"), lazy = FALSE)
}
acc <- accidents_raw
# cleanup
acc <- acc %>%
filter(between(`Temperature(F)`, -60, 130)) %>%
mutate(
date_ = date(Start_Time),
year_ = year(Start_Time),
month_ = month(Start_Time),
hour_ = hour(Start_Time),
`Precipitation(in)` = coalesce(`Precipitation(in)`, 0),
any_precip = `Precipitation(in)` > 0,
Weather_Condition = coalesce(Weather_Condition, "Unknown"),
`Temperature(F)` = coalesce(`Temperature(F)`, mean(`Temperature(F)`, na.rm = T))
)
# augmentation
state_lookup <- tibble(State = state.abb, state_name = str_to_title(state.name))
acc <- acc %>%
mutate(
sevg = case_when(
Severity == 1 ~ "least severe",
Severity == 2 ~ "less severe",
Severity == 3 ~ "more severe",
TRUE ~ "most severe"
)
) %>%
inner_join(state_lookup, by = "State")
Descriptive Analysis
summary(acc)
## ID Source Severity
## Length:7546771 Length:7546771 Min. :1.000
## Class :character Class :character 1st Qu.:2.000
## Mode :character Mode :character Median :2.000
## Mean :2.212
## 3rd Qu.:2.000
## Max. :4.000
##
## Start_Time End_Time Start_Lat
## Min. :2016-01-14 20:18:33 Min. :2016-02-08 06:37:08 Min. :24.55
## 1st Qu.:2018-11-20 16:22:02 1st Qu.:2018-11-20 17:22:44 1st Qu.:33.38
## Median :2020-11-10 08:23:39 Median :2020-11-10 15:11:14 Median :35.80
## Mean :2020-06-02 04:07:56 Mean :2020-06-02 11:34:12 Mean :36.19
## 3rd Qu.:2022-01-19 08:15:20 3rd Qu.:2022-01-19 19:01:21 3rd Qu.:40.11
## Max. :2023-03-31 23:30:00 Max. :2023-03-31 23:59:00 Max. :49.00
##
## Start_Lng End_Lat End_Lng Distance(mi)
## Min. :-124.62 Min. :24.6 Min. :-124.5 Min. : 0.000
## 1st Qu.:-117.22 1st Qu.:33.4 1st Qu.:-117.8 1st Qu.: 0.000
## Median : -87.81 Median :36.1 Median : -88.1 Median : 0.028
## Mean : -94.71 Mean :36.2 Mean : -95.8 Mean : 0.558
## 3rd Qu.: -80.38 3rd Qu.:40.2 3rd Qu.: -80.3 3rd Qu.: 0.460
## Max. : -67.11 Max. :49.1 Max. : -67.1 Max. :441.750
## NA's :3341777 NA's :3341777
## Description Street City County
## Length:7546771 Length:7546771 Length:7546771 Length:7546771
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## State Zipcode Country Timezone
## Length:7546771 Length:7546771 Length:7546771 Length:7546771
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Airport_Code Weather_Timestamp Temperature(F)
## Length:7546771 Min. :2016-01-14 19:51:00 Min. :-58.00
## Class :character 1st Qu.:2018-11-20 16:15:00 1st Qu.: 49.00
## Mode :character Median :2020-11-10 08:30:00 Median : 64.00
## Mean :2020-06-02 04:08:26 Mean : 61.67
## 3rd Qu.:2022-01-19 07:58:00 3rd Qu.: 76.00
## Max. :2023-03-31 23:53:00 Max. :129.20
##
## Wind_Chill(F) Humidity(%) Pressure(in) Visibility(mi)
## Min. :-80.00 Min. : 1.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 43.00 1st Qu.: 48.00 1st Qu.:29.37 1st Qu.: 10.00
## Median : 62.00 Median : 67.00 Median :29.86 Median : 10.00
## Mean : 58.25 Mean : 64.84 Mean :29.54 Mean : 9.09
## 3rd Qu.: 75.00 3rd Qu.: 84.00 3rd Qu.:30.03 3rd Qu.: 10.00
## Max. :128.00 Max. :100.00 Max. :58.63 Max. :140.00
## NA's :1833858 NA's :10278 NA's :7904 NA's :39499
## Wind_Direction Wind_Speed(mph) Precipitation(in) Weather_Condition
## Length:7546771 Min. : 0.00 Min. : 0.000000 Length:7546771
## Class :character 1st Qu.: 4.60 1st Qu.: 0.000000 Class :character
## Mode :character Median : 7.00 Median : 0.000000 Mode :character
## Mean : 7.69 Mean : 0.006127
## 3rd Qu.: 10.40 3rd Qu.: 0.000000
## Max. :1087.00 Max. :36.470000
## NA's :429617
## Amenity Bump Crossing Give_Way
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:7453817 FALSE:7543317 FALSE:6688257 FALSE:7511266
## TRUE :92954 TRUE :3454 TRUE :858514 TRUE :35505
##
##
##
##
## Junction No_Exit Railway Roundabout
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:6990004 FALSE:7527521 FALSE:7481985 FALSE:7546527
## TRUE :556767 TRUE :19250 TRUE :64786 TRUE :244
##
##
##
##
## Station Stop Traffic_Calming Traffic_Signal
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:7348460 FALSE:7337406 FALSE:7539342 FALSE:6424853
## TRUE :198311 TRUE :209365 TRUE :7429 TRUE :1121918
##
##
##
##
## Turning_Loop Sunrise_Sunset Civil_Twilight Nautical_Twilight
## Mode :logical Length:7546771 Length:7546771 Length:7546771
## FALSE:7546771 Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## Astronomical_Twilight date_ year_ month_
## Length:7546771 Min. :2016-01-14 Min. :2016 Min. : 1.0
## Class :character 1st Qu.:2018-11-20 1st Qu.:2018 1st Qu.: 3.0
## Mode :character Median :2020-11-10 Median :2020 Median : 7.0
## Mean :2020-06-01 Mean :2020 Mean : 6.7
## 3rd Qu.:2022-01-19 3rd Qu.:2022 3rd Qu.:10.0
## Max. :2023-03-31 Max. :2023 Max. :12.0
##
## hour_ any_precip sevg state_name
## Min. : 0.00 Mode :logical Length:7546771 Length:7546771
## 1st Qu.: 8.00 FALSE:7016077 Class :character Class :character
## Median :13.00 TRUE :530694 Mode :character Mode :character
## Mean :12.33
## 3rd Qu.:17.00
## Max. :23.00
##
dim(acc)
## [1] 7546771 53
str(acc)
## tibble [7,546,771 × 53] (S3: tbl_df/tbl/data.frame)
## $ ID : chr [1:7546771] "A-1" "A-2" "A-3" "A-4" ...
## $ Source : chr [1:7546771] "Source2" "Source2" "Source2" "Source2" ...
## $ Severity : num [1:7546771] 3 2 2 3 2 3 2 3 2 3 ...
## $ Start_Time : POSIXct[1:7546771], format: "2016-02-08 05:46:00" "2016-02-08 06:07:59" ...
## $ End_Time : POSIXct[1:7546771], format: "2016-02-08 11:00:00" "2016-02-08 06:37:59" ...
## $ Start_Lat : num [1:7546771] 39.9 39.9 39.1 39.7 39.6 ...
## $ Start_Lng : num [1:7546771] -84.1 -82.8 -84 -84.2 -84.2 ...
## $ End_Lat : num [1:7546771] NA NA NA NA NA NA NA NA NA NA ...
## $ End_Lng : num [1:7546771] NA NA NA NA NA NA NA NA NA NA ...
## $ Distance(mi) : num [1:7546771] 0.01 0.01 0.01 0.01 0.01 0.01 0 0.01 0 0.01 ...
## $ Description : chr [1:7546771] "Right lane blocked due to accident on I-70 Eastbound at Exit 41 OH-235 State Route 4." "Accident on Brice Rd at Tussing Rd. Expect delays." "Accident on OH-32 State Route 32 Westbound at Dela Palma Rd. Expect delays." "Accident on I-75 Southbound at Exits 52 52B US-35. Expect delays." ...
## $ Street : chr [1:7546771] "I-70 E" "Brice Rd" "State Route 32" "I-75 S" ...
## $ City : chr [1:7546771] "Dayton" "Reynoldsburg" "Williamsburg" "Dayton" ...
## $ County : chr [1:7546771] "Montgomery" "Franklin" "Clermont" "Montgomery" ...
## $ State : chr [1:7546771] "OH" "OH" "OH" "OH" ...
## $ Zipcode : chr [1:7546771] "45424" "43068-3402" "45176" "45417" ...
## $ Country : chr [1:7546771] "US" "US" "US" "US" ...
## $ Timezone : chr [1:7546771] "US/Eastern" "US/Eastern" "US/Eastern" "US/Eastern" ...
## $ Airport_Code : chr [1:7546771] "KFFO" "KCMH" "KI69" "KDAY" ...
## $ Weather_Timestamp : POSIXct[1:7546771], format: "2016-02-08 05:58:00" "2016-02-08 05:51:00" ...
## $ Temperature(F) : num [1:7546771] 36.9 37.9 36 35.1 36 37.9 34 34 33.3 37.4 ...
## $ Wind_Chill(F) : num [1:7546771] NA NA 33.3 31 33.3 35.5 31 31 NA 33.8 ...
## $ Humidity(%) : num [1:7546771] 91 100 100 96 89 97 100 100 99 100 ...
## $ Pressure(in) : num [1:7546771] 29.7 29.6 29.7 29.6 29.6 ...
## $ Visibility(mi) : num [1:7546771] 10 10 10 9 6 7 7 7 5 3 ...
## $ Wind_Direction : chr [1:7546771] "Calm" "Calm" "SW" "SW" ...
## $ Wind_Speed(mph) : num [1:7546771] NA NA 3.5 4.6 3.5 3.5 3.5 3.5 1.2 4.6 ...
## $ Precipitation(in) : num [1:7546771] 0.02 0 0 0 0 0.03 0 0 0 0.02 ...
## $ Weather_Condition : chr [1:7546771] "Light Rain" "Light Rain" "Overcast" "Mostly Cloudy" ...
## $ Amenity : logi [1:7546771] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Bump : logi [1:7546771] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Crossing : logi [1:7546771] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Give_Way : logi [1:7546771] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Junction : logi [1:7546771] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ No_Exit : logi [1:7546771] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Railway : logi [1:7546771] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Roundabout : logi [1:7546771] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Station : logi [1:7546771] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Stop : logi [1:7546771] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Traffic_Calming : logi [1:7546771] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Traffic_Signal : logi [1:7546771] FALSE FALSE TRUE FALSE TRUE FALSE ...
## $ Turning_Loop : logi [1:7546771] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Sunrise_Sunset : chr [1:7546771] "Night" "Night" "Night" "Night" ...
## $ Civil_Twilight : chr [1:7546771] "Night" "Night" "Night" "Day" ...
## $ Nautical_Twilight : chr [1:7546771] "Night" "Night" "Day" "Day" ...
## $ Astronomical_Twilight: chr [1:7546771] "Night" "Day" "Day" "Day" ...
## $ date_ : Date[1:7546771], format: "2016-02-08" "2016-02-08" ...
## $ year_ : num [1:7546771] 2016 2016 2016 2016 2016 ...
## $ month_ : num [1:7546771] 2 2 2 2 2 2 2 2 2 2 ...
## $ hour_ : int [1:7546771] 5 6 6 7 7 7 7 7 8 8 ...
## $ any_precip : logi [1:7546771] TRUE FALSE FALSE FALSE FALSE TRUE ...
## $ sevg : chr [1:7546771] "more severe" "less severe" "less severe" "more severe" ...
## $ state_name : chr [1:7546771] "Ohio" "Ohio" "Ohio" "Ohio" ...
head(acc)
# ACCIDENT SEVERITY COUNTS
sev_count <- acc %>%
count(sevg)
sev_count
p <- ggplot(sev_count, aes(x = sevg, y = n)) +
geom_col() +
scale_y_continuous(labels = scales::comma) +
labs(
title = "Accident Counts by Severity (2016 - 2023)",
x = "Severity",
y = "Number of Accidents"
)
ggplotly(p)
Geographic Trends
# https://www.census.gov/data/tables/time-series/demo/popest/2020s-state-detail.html
est_pop_state_2023 <- data.frame(
state_name = c(
"Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
"Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia",
"Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
"Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
"Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire",
"New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota",
"Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Rhode Island",
"South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Vermont",
"Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"
),
pop = c(
5108468, 733406, 7431344, 3067732, 38965193, 5877610, 3617176, 1031890,
678972, 22610726, 11029227, 1435138, 1964726, 12549689, 6862199, 3207004,
2940546, 4526154, 4573749, 1395722, 6180253, 7001399, 10037261, 5737915,
2939690, 6196156, 1132812, 1978379, 3194176, 1402054, 9290841, 2114371,
19571216, 10835491, 783926, 11785935, 4053824, 4233358, 12961683, 1095962,
5373555, 919318, 7126489, 30503301, 3417734, 647464, 8715698, 7812880,
1770071, 5910955, 584057
)
)
states_map <- map_data("state")
# make state names uppercase so they look better on the map
states_map$region <- str_to_title(states_map$region)
acc_state <- acc %>%
group_by(State, state_name) %>%
summarise(
mean_sev = mean(Severity),
n_acc = n(),
groups = "drop"
) %>%
left_join(est_pop_state_2023, by = "state_name") %>%
mutate(
acc_per_100k = 100000 * n_acc / pop
)
head(acc_state %>% arrange(desc(acc_per_100k)))
Average Severity by State
mean_sev_state_bar <- ggplot(
data = acc_state,
aes(
x = reorder(State, -mean_sev),
y = mean_sev
)
) +
geom_col(position = "dodge") +
coord_flip() +
labs(
title = "Average Accident Severity by State (2016 – 2023)",
x = NULL,
y = "Mean Severity (1 = least, 4 = most)"
)
ggplotly(mean_sev_state_bar, tooltip = c("x", "y"))
mean_sev_state_map <- ggChoropleth(
data = acc_state,
aes(fill = mean_sev, map_id = state_name),
map = states_map,
interactive = T,
title = "Average Accident Severity by State (2016 – 2023)"
)
mean_sev_state_map
Total Accidents per State
n_acc_state_map <- ggChoropleth(
data = acc_state,
aes(fill = acc_per_100k, map_id = state_name),
map = states_map,
interactive = T,
title = "Total Accidents Per 100K Residents",
)
n_acc_state_map